#coding:utf-8
import os
import shutil
import sys
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

# Command line: <script> QUESTION_ID [MAX_ANSWERS]
if len(sys.argv) < 2:
    sys.exit('usage: {} QUESTION_ID [MAX_ANSWERS]'.format(sys.argv[0]))

url = 'https://www.zhihu.com/question/{}'.format(sys.argv[1])

# Number of answers to try to load; it also caps the number of scroll
# attempts made while loading them. Defaults to 150 when not given.
num = 150
# argv[2] only exists when at least three entries are present; testing for
# "> 1" would raise IndexError whenever just the question id was passed.
if len(sys.argv) > 2:
    num = int(sys.argv[2])

# Configure a headless Chrome session for scraping the question page.
chrome_options = Options()
chrome_options.add_argument('--headless')
#chrome_options.add_argument('--single-process')
# Image content setting 2 -- presumably "block", so pages load without
# fetching pictures (TODO confirm against Chrome's preference documentation).
prefs = {"profile.managed_default_content_settings.images":2}
chrome_options.add_experimental_option("prefs",prefs)

# Start the browser and open the question page; `browser` is the handle the
# rest of the script queries.
# NOTE(review): `chrome_options=` is reportedly deprecated in favour of
# `options=` in later Selenium releases -- confirm against the installed
# version before changing it.
browser=webdriver.Chrome(chrome_options=chrome_options)
browser.get(url)

def download_all_pic(content):
    """Queue the images of an HTML fragment for download and localise them.

    Despite the name, nothing is downloaded here. Each image URL is appended
    to ``need_download.txt`` in the current working directory, and the tag's
    ``src`` is rewritten to the last ``/``-separated component of that URL so
    the saved HTML refers to a file sitting next to it.

    ``<img>`` tags with no ``src``, an empty ``src`` or an inline ``data:``
    URI are left untouched: there is nothing to fetch for them, and splitting
    a ``data:`` URI on ``/`` would only corrupt it.

    Args:
        content: HTML markup to scan.

    Returns:
        The prettified HTML with the image sources rewritten.
    """
    soup = BeautifulSoup(content, 'html.parser')
    with open('need_download.txt', 'a+') as f:
        for pic in soup.find_all('img'):
            src = pic.get('src')
            if not src or src.startswith('data:'):
                continue
            f.write(src + '\n')
            pic['src'] = src.split('/')[-1]
    return soup.prettify()

# The text of the second '.QuestionHeader-title' match is used as the title
# (presumably the first match is a duplicate header -- verify on the page).
title = browser.find_elements_by_css_selector('.QuestionHeader-title')[1].text

# Work inside a directory named after the question. The title comes from the
# web page, so it must not be interpolated into a shell command: spaces would
# split it into several arguments and metacharacters would be executed.
# os.mkdir, like a plain `mkdir`, creates a single directory level only.
if not os.path.isdir(title):
    os.mkdir(title)
os.chdir(title)

# Link back to the original question; written at the top of the book.
title_link = '<a href={}>{}</a>'.format(url, title)

# Capture the question description as HTML, expanding it first when the page
# offers a "show more" button.
question_content = ''
btns = browser.find_elements_by_css_selector('.Button.QuestionRichText-more.Button--plain')
if not btns:
    # No expand button: read the description from the block as it is rendered.
    try:
        description_box = browser.find_element_by_css_selector('.QuestionRichText.QuestionRichText--collapsed')
        rich_text = description_box.find_element_by_css_selector('.RichText.ztext')
        question_content = rich_text.get_attribute('innerHTML')
    except Exception:
        # Sometimes the question has no description at all, only a title.
        question_content = '<br/>'
else:
    # Click the collapsed block to expand it, then read the full description.
    collapsed_box = browser.find_element_by_css_selector('.QuestionRichText.QuestionRichText--expandable.QuestionRichText--collapsed')
    collapsed_box.click()
    expanded_box = browser.find_element_by_css_selector('.QuestionRichText.QuestionRichText--expandable')
    rich_text = expanded_box.find_element_by_css_selector('.RichText.ztext')
    question_content = rich_text.get_attribute('innerHTML')
    
#print(question_content)   

# Scroll until at least `num` answers are present (or the scroll budget,
# also `num`, is used up), then collect author, vote label and body of every
# answer found. The browser is shut down afterwards in all cases.
answers = []
retry = 1
try:
    get_num = len(browser.find_elements_by_css_selector('.ContentItem.AnswerItem'))
    print(get_num)
    while get_num < num and retry < num:
        # Jump to the bottom and nudge a little further; presumably this is
        # what makes the page load the next batch of answers.
        browser.execute_script("window.scrollBy(0,document.body.scrollHeight)")
        browser.execute_script("window.scrollBy(0,100)")
        time.sleep(.3)  # give the page a moment before counting again
        retry += 1
        get_num = len(browser.find_elements_by_css_selector('.ContentItem.AnswerItem'))
        print(get_num)

    for item in browser.find_elements_by_css_selector('.ContentItem.AnswerItem'):
        d = {}
        content = item.find_element_by_css_selector('.RichText.ztext.CopyrightRichText-richText')
        d['content'] = content.get_attribute('innerHTML')
        try:
            user_name = item.find_element_by_css_selector('.UserLink.AuthorInfo-name').find_element_by_css_selector('.UserLink-link')
            d['user'] = user_name.get_attribute('outerHTML')
        except Exception:
            # Best effort: when no author link can be read, label the answer
            # as written by an anonymous user.
            d['user'] = '匿名用户'
        print(d['user'])
        # The up-vote button's aria-label is stored as the vote text.
        d['agree_num'] = item.find_element_by_css_selector('.Button.VoteButton.VoteButton--up').get_attribute('aria-label')
        answers.append(d)
finally:
    # Always close the browser, even when scraping raises, so that no
    # headless Chrome process is left running.
    browser.quit()
    
# Assemble the book: the link to the question and its description first, then
# one section per answer (author heading, vote label, body). Each section goes
# through download_all_pic, which queues its images and rewrites their sources.
# The file is written as UTF-8 explicitly so the result does not depend on the
# locale's default encoding.
with open('book.html', 'w', encoding='utf-8') as f:
    f.write(title_link + question_content)
    for answer in answers:
        # get_attribute may return None when the label is missing.
        votes = answer['agree_num'] or ''
        section = ('<h3 class=chapter>' + answer['user'] + '</h3>\t' + votes
                   + '<br/>' + answer['content'] + '<HR><HR><HR>')
        f.write(download_all_pic(section))

# De-duplicate the queued image URLs and fetch them next to book.html.
os.system('sort need_download.txt|uniq > need_download2.txt && aria2c -s 30 -i need_download2.txt')

# Convert to MOBI and, only if the conversion succeeded, move the result to
# ../mobi/<title>.mobi. The move is done in Python rather than through `mv`
# in a shell, because the title comes from the web page and may contain spaces
# or shell metacharacters.
if os.system('ebook-convert book.html book.mobi') == 0:
    mobi_dir = os.path.join('..', 'mobi')
    os.makedirs(mobi_dir, exist_ok=True)
    shutil.move('book.mobi', os.path.join(mobi_dir, '{}.mobi'.format(title)))

